#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from newspaper import Article

import bs4
import requests

# Page passed to ezwork.get_urls() further down; the URLs that call returns
# are the articles this script downloads and analyses.
URL = "http://ent.sina.com.cn/"

import ezwork

def check(url):
    """Return True when the substring ``'article'`` occurs anywhere in *url*.

    Passed to ``ezwork.get_urls`` as its ``check`` callback.
    """
    contains_article = 'article' in url
    return contains_article

# URLs to analyse, collected by ezwork.get_urls() starting from URL with
# `check` supplied as its callback (presumably only links for which check()
# returns True are kept -- confirm against ezwork). The result is iterated
# once by the download loop below.
urls = ezwork.get_urls(URL, check=check)

import jieba, jieba.analyse

# Whitespace and punctuation tokens (ASCII and full-width Chinese).
# NOTE(review): not referenced anywhere in the visible part of this script.
punc = {"\n", " ", "，", "。", "“", "”", "、", "(", ")", "："}

# Common Chinese function words. Each entry is listed once; a set would
# collapse repeats anyway.
# NOTE(review): not referenced anywhere in the visible part of this script.
stop = {"的", "着", "是", "地"}

from newspaper import ArticleException

# Every occurrence of a per-article keyword, accumulated over all articles.
words = []
# For a quick single-article run, override the crawl result, e.g.:
# urls =['http://3g.163.com/tech/article/E2BFIO9V00097U7S.html']
for url in urls:
    news = Article(url, language='zh')
    try:
        news.download()
        news.parse()
    except ArticleException as err:
        # One unreachable or unparsable page must not abort the whole crawl;
        # report it and move on to the next URL.
        print("skipping %s: %s" % (url, err))
        continue

    # Optional: load a custom stop-word file for keyword extraction.
    # jieba.analyse.set_stop_words("stop_words.txt")

    text = news.text
    # The 15 highest-ranked keywords of this article.
    keys = jieba.analyse.extract_tags(text, topK=15)
    # Set lookup keeps the per-token membership test O(1).
    key_set = set(keys)
    # Keep every token of the article that is one of its keywords, so the
    # later count reflects how often each keyword actually occurs.
    words.extend(w for w in jieba.cut(text) if w in key_set)

import collections

# Occurrence count of each keyword token collected in `words`.
c = collections.Counter()
c.update(words)

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Font used for the Chinese tick labels.
# NOTE(review): this path is a macOS system font; on other platforms the
# file will be missing and a different CJK-capable font must be supplied.
myfont = FontProperties(fname='/System/Library/Fonts/PingFang.ttc')

# Keywords ordered from most to least frequent (ties keep first-seen order),
# with their counts in the matching positions of `y`.
ranked = c.most_common()
x = [word for word, _ in ranked]
y = [count for _, count in ranked]

# Plot at most the 15 most frequent keywords.
top_x = x[:15]
top_y = y[:15]

if top_y:
    # Tick positions must match the number of labels: fewer than 15 distinct
    # keywords would otherwise give matplotlib mismatched ticks and labels,
    # which recent versions reject with a ValueError.
    plt.xticks(np.arange(len(top_x)), top_x, fontproperties=myfont)
    plt.stem(top_y)
    plt.show()
else:
    # Nothing was collected (no URLs, or every article was empty); a stem
    # plot of an empty sequence cannot be drawn.
    print("no keywords extracted; nothing to plot")
